library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(gridExtra)
library(pheatmap)
library(UpSetR)
library(ensembldb)
library(apeglm)
library(ashr)
DB <- "EnsDb" # Assing your species
AnnotationSpecies <- "mus musculus" # Assign your species
ah <- AnnotationHub(hub=getAnnotationHubOption("URL")) # Connect to annotation DB
# Filter annotation of interest
ahQuery <- query(ah,
pattern=c(DB, AnnotationSpecies),
ignore.case=T)
# Select the most recent data
DBName <- mcols(ahQuery) %>%
rownames() %>%
tail(1)
AnnoDb <- ah[[DBName]]
# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")
# Note: Annotation has to be done with not genome but transcripts
AnnoDb <- select(AnnoDb,
AnnoKey,
keytype="TXID",
columns=c("GENEID", "GENENAME"))
# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)
## TXID GENEID GENENAME
## 1 ENSMUST00000000001 ENSMUSG00000000001 Gnai3
## 2 ENSMUST00000000003 ENSMUSG00000000003 Pbsn
## 3 ENSMUST00000000010 ENSMUSG00000020875 Hoxb9
## 4 ENSMUST00000000028 ENSMUSG00000000028 Cdc45
## 5 ENSMUST00000000033 ENSMUSG00000048583 Igf2
## 6 ENSMUST00000000049 ENSMUSG00000000049 Apoh
.sf files have been created from fastq data by salmon
# This code chunk needs to be written by yourself
#
# Define sample names
SampleNames <- c(paste0("WT-rep", 1:3), paste0("cKO-rep", 1:3))
# Define group level
GroupLevel <- c("WT", "cKO")
# Define contrast for DE analysis
Contrast <- c("Group", "cKO", "WT")
# Define a vector for comparing TPM vs Counts effect
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC
# Define .sf file path
sf <- c(paste0("../salmon_output/",
SampleNames,
".salmon_quant/quant.sf"))
# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))
# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
Group=factor(group, levels=GroupLevel),
Path=sf)
rownames(metadata) <- SampleNames
# Explore the metadata
print(metadata)
## Sample Group Path
## WT-rep1 WT-rep1 WT ../salmon_output/WT-rep1.salmon_quant/quant.sf
## WT-rep2 WT-rep2 WT ../salmon_output/WT-rep2.salmon_quant/quant.sf
## WT-rep3 WT-rep3 WT ../salmon_output/WT-rep3.salmon_quant/quant.sf
## cKO-rep1 cKO-rep1 cKO ../salmon_output/cKO-rep1.salmon_quant/quant.sf
## cKO-rep2 cKO-rep2 cKO ../salmon_output/cKO-rep2.salmon_quant/quant.sf
## cKO-rep3 cKO-rep3 cKO ../salmon_output/cKO-rep3.salmon_quant/quant.sf
# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames
# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
countsFromAbundance="lengthScaledTPM", # Extracts TPM
ignoreTxVersion=T)
txi_counts <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
ignoreTxVersion=T)
# tpm
head(txi_tpm$counts)
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2
## ENSMUSG00000000001 6572.14450 5800.14784 5642.53666 5996.0868676 6077.604244
## ENSMUSG00000000003 0.00000 0.00000 0.00000 0.0000000 0.000000
## ENSMUSG00000000028 3917.45158 3550.56247 3714.75413 3422.1658384 3407.098630
## ENSMUSG00000000031 25.53275 61.51501 45.35886 9.5023217 38.694556
## ENSMUSG00000000037 85.32325 75.78921 107.66451 138.5489706 170.156259
## ENSMUSG00000000049 0.00000 0.00000 0.00000 0.9982156 1.995162
## cKO-rep3
## ENSMUSG00000000001 5349.93335
## ENSMUSG00000000003 0.00000
## ENSMUSG00000000028 3121.96322
## ENSMUSG00000000031 95.50979
## ENSMUSG00000000037 137.21541
## ENSMUSG00000000049 0.00000
dim(txi_tpm$counts)
## [1] 51956 6
# counts
head(txi_counts$counts)
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6492.000 5822.000 5614.000 6040.00 6142.00 5437.000
## ENSMUSG00000000003 0.000 0.000 0.000 0.00 0.00 0.000
## ENSMUSG00000000028 3904.066 3567.989 3714.192 3410.46 3463.04 3142.454
## ENSMUSG00000000031 33.000 45.000 58.000 12.00 28.00 70.000
## ENSMUSG00000000037 119.001 90.001 82.000 165.00 129.00 100.000
## ENSMUSG00000000049 0.000 0.000 0.000 1.00 2.00 0.000
dim(txi_counts$counts)
## [1] 51956 6
# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) {
# Create a DESeq object (so-calledd dds)
des <- DESeqDataSetFromTximport(txi,
colData=metadata,
design=~Group)
# Create a vsd object (so-called vsd)
ves <- vst(des, blind=T)
# Output them as a list
return(list(dds=des, vsd=ves))
}
TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']]
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]
# TPM
TPM[[1]]
## class: DESeqDataSet
## dim: 51956 6
## metadata(1): version
## assays(1): counts
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6572 5800 5643 5996 6078 5350
## ENSMUSG00000000003 0 0 0 0 0 0
## ENSMUSG00000000028 3917 3551 3715 3422 3407 3122
## ENSMUSG00000000031 26 62 45 10 39 96
## ENSMUSG00000000037 85 76 108 139 170 137
## ENSMUSG00000000049 0 0 0 1 2 0
# Counts
Counts[[1]]
## class: DESeqDataSet
## dim: 51956 6
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6492 5822 5614 6040 6142 5437
## ENSMUSG00000000003 0 0 0 0 0 0
## ENSMUSG00000000028 3904 3568 3714 3410 3463 3142
## ENSMUSG00000000031 33 45 58 12 28 70
## ENSMUSG00000000037 119 90 82 165 129 100
## ENSMUSG00000000049 0 0 0 1 2 0
# TPM
TPM[[2]]
## class: DESeqTransform
## dim: 51956 6
## metadata(1): version
## assays(1): ''
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform
## dim: 51956 6
## metadata(1): version
## assays(1): ''
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
List[[1]] <- estimateSizeFactors(List[[1]])
List[[1]] <- estimateDispersions(List[[1]])
List[[1]] <- nbinomWaldTest(List[[1]])
return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts)
TPM <- DESeqPrep_fn(TPM)
sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1.1099245 0.9997388 0.9771411 1.0203120 1.0318653 0.8935969
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead!
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
## Sample Group Path
## <factor> <factor> <character>
## WT-rep1 WT-rep1 WT ../salmon_output/WT-..
## WT-rep2 WT-rep2 WT ../salmon_output/WT-..
## WT-rep3 WT-rep3 WT ../salmon_output/WT-..
## cKO-rep1 cKO-rep1 cKO ../salmon_output/cKO..
## cKO-rep2 cKO-rep2 cKO ../salmon_output/cKO..
## cKO-rep3 cKO-rep3 cKO ../salmon_output/cKO..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
## Sample Group Path sizeFactor
## <factor> <factor> <character> <numeric>
## WT-rep1 WT-rep1 WT ../salmon_output/WT-.. 1.109925
## WT-rep2 WT-rep2 WT ../salmon_output/WT-.. 0.999739
## WT-rep3 WT-rep3 WT ../salmon_output/WT-.. 0.977141
## cKO-rep1 cKO-rep1 cKO ../salmon_output/cKO.. 1.020312
## cKO-rep2 cKO-rep2 cKO ../salmon_output/cKO.. 1.031865
## cKO-rep3 cKO-rep3 cKO ../salmon_output/cKO.. 0.893597
# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))
colnames(sizeFactor) <- 'Size_Factor'
sizeFactor <- sizeFactor %>%
rownames_to_column(var="Sample") %>%
inner_join(metadata[, 1:ncol(metadata)-1], by="Sample")
sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)
# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample,
y=Size_Factor,
fill=Group,
label=Size_Factor)) +
geom_bar(stat="identity", width=0.8) +
theme_bw() +
ggtitle("Size Factors in TPM-DESeq") +
geom_text(vjust=1.5) +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")
# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
gather(Sample, Normalization_Factor) %>%
inner_join(metadata[, 1:2], by="Sample")
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors
normFactors_plot <- ggplot(normf,
aes(x=Sample, y=Normalization_Factor)) +
geom_jitter(alpha=0.5, aes(color=Group)) +
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor),
outlier.shape=NA, alpha=0.5) +
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) +
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1,
color="blue",
linetype="dashed")
# Print the normalization factor scatter plot
print(normFactors_plot)
# Print the same plot with larger y-magnification in order to observe the box plot
normFactors_plot +
ylim(0.8, 1.2)
# Assigne what to compare
GroupOfInterest <- Contrast[1]
# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {
plotPCA(inputList[[2]], # takes vsd
intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}
# Set heatmap annotation
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]
# Set a function for a correlation heatmap
QCcorrHeatmap_fn <- function(inputList, Title) {
# Extract transformed count matrix
mtx <- assay(inputList[[2]]) # takes vsd
# Calculate correlation and store in the matrix
mtx <- cor(mtx)
# Create a correlation heatmap
return(pheatmap(mtx,
annotation=HeatmapAnno,
main=paste("Sample Correlation Heatmap:",
Title)))
}
grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"),
QCPCA_fn(Counts, "QC PCA: Counts"),
nrow=2)
# TPM
QCcorrHeatmap_fn(TPM, "TPM")
# Counts
QCcorrHeatmap_fn(Counts, "Counts")
# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]])
names(desList) <- TvC
# Create a list for TPM and Counts dds
ddsList <- desList # DE without shrinkage
normal.ddsList <- desList # DE with "normal" shrinkage
ape.ddsList <- desList # DE with "apeglm" shrinkage
ash.ddsList <- desList # DE with "ashr" shrinkage
for (x in TvC) {
# Run DESeq()
ddsList[[x]] <- DESeq(desList[[x]])
print(resultsNames(ddsList[[x]]))
normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
contrast=Contrast,
type="normal")
ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="apeglm")
ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="ashr")
}
## [1] "Intercept" "Group_cKO_vs_WT"
## [1] "Intercept" "Group_cKO_vs_WT"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage
# Plot dispersion
for (x in TvC) {
plotDispEsts(ddsList[[x]],
ylab="Dispersion",
xlab="Mean of Normalized Counts",
main=paste("Dispersion of", x, "Input"))
}
# Set FDR threshold
alpha=0.1
# FDR threshold vector
FDRv=c("< 0.1", "> 0.1")
# Initialize lists of result tables
all.resList <- all.ddsList
# Set a function cleaning table
Sig.fn <- function(df, Input) {
df <- df %>%
rownames_to_column(var="Gene") %>%
mutate(FDR=ifelse(padj < 0.1 & !is.na(padj),
FDRv[1],
FDRv[2]),
Input=Input)
return(df)
}
for (i in shrinkage) {
if (i == "None") {
for (x in TvC) {
# Extract data frames from unshrunken lfc & clean data
all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]],
contrast=Contrast,
alpha=alpha)) %>%
Sig.fn(x)
}
} else {
# Extract data frames from shrunken lfc & clean data
for (x in TvC) {
all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
Sig.fn(x)
}
}
}
# Explore the results
summary(all.resList)
## Length Class Mode
## None 2 -none- list
## Normal 2 -none- list
## Apeglm 2 -none- list
## Ashr 2 -none- list
head(all.resList[[1]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5875.2710822 0.0208662 0.05803844 0.3595238
## 2 ENSMUSG00000000003 0.0000000 NA NA NA
## 3 ENSMUSG00000000028 3505.3855243 -0.1007121 0.07316717 -1.3764651
## 4 ENSMUSG00000000031 47.7535803 0.2351843 0.73316295 0.3207804
## 5 ENSMUSG00000000037 119.5706931 0.7900807 0.24416906 3.2357939
## 6 ENSMUSG00000000049 0.4863883 2.4485413 3.94664218 0.6204113
## pvalue padj FDR Input
## 1 0.719203270 0.94846055 > 0.1 TPM
## 2 NA NA > 0.1 TPM
## 3 0.168677681 0.60188898 > 0.1 TPM
## 4 0.748376798 0.95470956 > 0.1 TPM
## 5 0.001213049 0.03148688 < 0.1 TPM
## 6 0.534987059 NA > 0.1 TPM
head(all.resList[[1]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5914.7570322 0.02115354 0.05853484 0.3613837
## 2 ENSMUSG00000000003 0.0000000 NA NA NA
## 3 ENSMUSG00000000028 3528.9626791 -0.10046859 0.07347464 -1.3673913
## 4 ENSMUSG00000000031 45.9582159 0.22318728 0.74187581 0.3008418
## 5 ENSMUSG00000000037 116.2530810 0.80391955 0.24247966 3.3154103
## 6 ENSMUSG00000000049 0.4886776 2.45554210 3.85387491 0.6371619
## pvalue padj FDR Input
## 1 0.7178126394 0.94593163 > 0.1 Counts
## 2 NA NA > 0.1 Counts
## 3 0.1715026994 0.60733329 > 0.1 Counts
## 4 0.7635351064 0.95710414 > 0.1 Counts
## 5 0.0009150871 0.02512149 < 0.1 Counts
## 6 0.5240194188 NA > 0.1 Counts
head(all.resList[[2]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5875.2710822 0.02033329 0.05655623 0.3595238
## 2 ENSMUSG00000000003 0.0000000 NA NA NA
## 3 ENSMUSG00000000028 3505.3855243 -0.09668461 0.07024128 -1.3764651
## 4 ENSMUSG00000000031 47.7535803 0.04527843 0.14148646 0.3207804
## 5 ENSMUSG00000000037 119.5706931 0.53955719 0.16664524 3.2357939
## 6 ENSMUSG00000000049 0.4863883 0.02254895 0.03170670 0.6204113
## pvalue padj FDR Input
## 1 0.719203270 0.94846055 > 0.1 TPM
## 2 NA NA > 0.1 TPM
## 3 0.168677681 0.60188898 > 0.1 TPM
## 4 0.748376798 0.95470956 > 0.1 TPM
## 5 0.001213049 0.03148688 < 0.1 TPM
## 6 0.534987059 NA > 0.1 TPM
head(all.resList[[2]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5914.7570322 0.02060873 0.05702735 0.3613837
## 2 ENSMUSG00000000003 0.0000000 NA NA NA
## 3 ENSMUSG00000000028 3528.9626791 -0.09645101 0.07053657 -1.3673913
## 4 ENSMUSG00000000031 45.9582159 0.04223396 0.14140232 0.3008418
## 5 ENSMUSG00000000037 116.2530810 0.55189675 0.16671595 3.3154103
## 6 ENSMUSG00000000049 0.4886776 0.02385755 0.03270373 0.6371619
## pvalue padj FDR Input
## 1 0.7178126394 0.94593163 > 0.1 Counts
## 2 NA NA > 0.1 Counts
## 3 0.1715026994 0.60733329 > 0.1 Counts
## 4 0.7635351064 0.95710414 > 0.1 Counts
## 5 0.0009150871 0.02512149 < 0.1 Counts
## 6 0.5240194188 NA > 0.1 Counts
# Set ylim: has to adjusted by users depending on data
yl <- c(-7.5, 7.5)
# Set min log2 fold change of interest
mLog <- c(-1, 1)
# Initialize a list storing MA plots
MAList <- ddsList
# Create MA plots
for (i in shrinkage) {
both.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
MAList[[i]] <- ggplot(both.df,
aes(x=baseMean, y=log2FoldChange, color=FDR)) +geom_point() + scale_x_log10() + facet_grid(~Input) +
theme_bw() +
scale_color_manual(values=c("blue", "grey")) +
ggtitle(paste("MA plot with", i)) +
ylim(yl[1], yl[2]) +
geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red")
}
# Print MA plots
MAList
## $TPM
## class: DESeqDataSet
## dim: 51956 6
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
##
## $Counts
## class: DESeqDataSet
## dim: 51956 6
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
##
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']],
all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)
# Plot distribution of FDR
ggplot(both.df,
aes(x=padj, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") +
xlab("Adjusted P-Value (FDR)") +
ylab("Count") +
geom_vline(xintercept=alpha,
color="black",
linetype="dashed",
size=1) +
scale_x_continuous(breaks=seq(0, 1, by=0.1))
# Initialize a lfc list
lfcplotList <- all.resList
# Create lfc distribution plots
for (i in shrinkage) {
lfc.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]
lfc.df$Input <- factor(lfc.df$Input, levels=TvC)
lfcplotList[[i]] <- ggplot(lfc.df, # Subset rows with FDR < alpha
aes(x=log2FoldChange, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() + ylab("Count") +
geom_vline(xintercept=c(mLog[1], mLog[2]),
color="black",
linetype="dashed",
size=1) +
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) +
xlim(-5, 5)
}
# Print the lfc plots
lfcplotList
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Count number of NA genes
type=c("Zero Counts", "Outliers", "Total NA Genes")
# Create a data frame storing NA gene number
NAstat <- both.df %>%
group_by(Input) %>%
summarize(zero=sum(is.na(log2FoldChange)),
outlier=sum(is.na(pvalue) & is.na(padj))) %>%
mutate(total=zero + outlier) %>%
gather(Type, Number, -Input) %>%
mutate(Type=factor(case_when(Type == "zero" ~ type[1],
Type == "outlier" ~ type[2],
Type == "total" ~ type[3]),
levels=type))
# Plot number of NA genes
ggplot(NAstat,
aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
geom_text(position=position_dodge(width=1), vjust=1.5) +
ggtitle("Number of NA Genes") +
ylab("Number of Genes")
# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>%
group_by(GENEID) %>%
summarize(num.trans=n_distinct(TXID))
# Create an empty list storing significant gene lfc tables
sigList <- list()
# Filter significant genes' lfc and save in the list
for (x in shrinkage) {
for (y in TvC) {
# Subset genes with FDR below alpha
sigList[[x]][[y]] <- subset(all.resList[[x]][[y]],
FDR == paste("<", alpha))
# Explore the output
print(head(sigList[[x]][[y]]))
print(dim(sigList[[x]][[y]]))
}
}
## Gene baseMean log2FoldChange lfcSE stat
## 5 ENSMUSG00000000037 119.57069 0.7900807 0.24416906 3.235794
## 16 ENSMUSG00000000125 32.58857 1.6829572 0.58565055 2.873654
## 49 ENSMUSG00000000278 2045.98646 0.3824490 0.09661649 3.958423
## 55 ENSMUSG00000000303 17.85736 -1.9802205 0.61593594 -3.214978
## 57 ENSMUSG00000000308 176.27983 0.8913243 0.23443336 3.802037
## 78 ENSMUSG00000000409 16477.95193 -0.2448023 0.06132521 -3.991871
## pvalue padj FDR Input
## 5 1.213049e-03 0.031486877 < 0.1 TPM
## 16 4.057528e-03 0.072140679 < 0.1 TPM
## 49 7.544613e-05 0.003849140 < 0.1 TPM
## 55 1.304545e-03 0.033524391 < 0.1 TPM
## 57 1.435114e-04 0.006551012 < 0.1 TPM
## 78 6.555411e-05 0.003512334 < 0.1 TPM
## [1] 927 9
## Gene baseMean log2FoldChange lfcSE stat
## 5 ENSMUSG00000000037 116.25308 0.8039195 0.24247966 3.315410
## 16 ENSMUSG00000000125 32.76783 1.6985679 0.57789160 2.939250
## 49 ENSMUSG00000000278 2059.81139 0.3823293 0.09661858 3.957099
## 55 ENSMUSG00000000303 17.90961 -1.9770953 0.60846497 -3.249317
## 57 ENSMUSG00000000308 177.45680 0.8927097 0.23256220 3.838585
## 78 ENSMUSG00000000409 16587.89462 -0.2445521 0.06193397 -3.948593
## pvalue padj FDR Input
## 5 9.150871e-04 0.025121492 < 0.1 Counts
## 16 3.290075e-03 0.062778062 < 0.1 Counts
## 49 7.586541e-05 0.003860243 < 0.1 Counts
## 55 1.156826e-03 0.030092651 < 0.1 Counts
## 57 1.237455e-04 0.005807258 < 0.1 Counts
## 78 7.861174e-05 0.003985386 < 0.1 Counts
## [1] 943 9
## Gene baseMean log2FoldChange lfcSE stat
## 5 ENSMUSG00000000037 119.57069 0.5395572 0.16664524 3.235794
## 16 ENSMUSG00000000125 32.58857 0.4491230 0.16093685 2.873654
## 49 ENSMUSG00000000278 2045.98646 0.3565501 0.09007159 3.958423
## 55 ENSMUSG00000000303 17.85736 -0.4992870 0.15972135 -3.214978
## 57 ENSMUSG00000000308 176.27983 0.6242894 0.16406998 3.802037
## 78 ENSMUSG00000000409 16477.95193 -0.2378428 0.05958165 -3.991871
## pvalue padj FDR Input
## 5 1.213049e-03 0.031486877 < 0.1 TPM
## 16 4.057528e-03 0.072140679 < 0.1 TPM
## 49 7.544613e-05 0.003849140 < 0.1 TPM
## 55 1.304545e-03 0.033524391 < 0.1 TPM
## 57 1.435114e-04 0.006551012 < 0.1 TPM
## 78 6.555411e-05 0.003512334 < 0.1 TPM
## [1] 927 9
## Gene baseMean log2FoldChange lfcSE stat
## 5 ENSMUSG00000000037 116.25308 0.5518968 0.16671595 3.315410
## 16 ENSMUSG00000000125 32.76783 0.4653339 0.16288391 2.939250
## 49 ENSMUSG00000000278 2059.81139 0.3566400 0.09012448 3.957099
## 55 ENSMUSG00000000303 17.90961 -0.5117566 0.16164889 -3.249317
## 57 ENSMUSG00000000308 177.45680 0.6298021 0.16394142 3.838585
## 78 ENSMUSG00000000409 16587.89462 -0.2375228 0.06015364 -3.948593
## pvalue padj FDR Input
## 5 9.150871e-04 0.025121492 < 0.1 Counts
## 16 3.290075e-03 0.062778062 < 0.1 Counts
## 49 7.586541e-05 0.003860243 < 0.1 Counts
## 55 1.156826e-03 0.030092651 < 0.1 Counts
## 57 1.237455e-04 0.005807258 < 0.1 Counts
## 78 7.861174e-05 0.003985386 < 0.1 Counts
## [1] 943 9
## Gene baseMean log2FoldChange lfcSE pvalue
## 5 ENSMUSG00000000037 119.57069 0.60077253 0.28973994 1.213049e-03
## 16 ENSMUSG00000000125 32.58857 1.01605413 1.00001036 4.057528e-03
## 49 ENSMUSG00000000278 2045.98646 0.33381180 0.10234699 7.544613e-05
## 55 ENSMUSG00000000303 17.85736 -0.04631661 0.10830675 1.304545e-03
## 57 ENSMUSG00000000308 176.27983 0.74965061 0.25875052 1.435114e-04
## 78 ENSMUSG00000000409 16477.95193 -0.23092435 0.06322695 6.555411e-05
## padj FDR Input
## 5 0.031486877 < 0.1 TPM
## 16 0.072140679 < 0.1 TPM
## 49 0.003849140 < 0.1 TPM
## 55 0.033524391 < 0.1 TPM
## 57 0.006551012 < 0.1 TPM
## 78 0.003512334 < 0.1 TPM
## [1] 927 8
## Gene baseMean log2FoldChange lfcSE pvalue
## 5 ENSMUSG00000000037 116.25308 0.62198670 0.28462912 9.150871e-04
## 16 ENSMUSG00000000125 32.76783 1.09745828 0.86109790 3.290075e-03
## 49 ENSMUSG00000000278 2059.81139 0.33379511 0.10230443 7.586541e-05
## 55 ENSMUSG00000000303 17.90961 -0.04878957 0.11100468 1.156826e-03
## 57 ENSMUSG00000000308 177.45680 0.75437557 0.25589513 1.237455e-04
## 78 ENSMUSG00000000409 16587.89462 -0.21981293 0.06380161 7.861174e-05
## padj FDR Input
## 5 0.025121492 < 0.1 Counts
## 16 0.062778062 < 0.1 Counts
## 49 0.003860243 < 0.1 Counts
## 55 0.030092651 < 0.1 Counts
## 57 0.005807258 < 0.1 Counts
## 78 0.003985386 < 0.1 Counts
## [1] 943 8
## Gene baseMean log2FoldChange lfcSE pvalue
## 5 ENSMUSG00000000037 119.57069 0.4397207 0.32998090 1.213049e-03
## 16 ENSMUSG00000000125 32.58857 0.3244977 0.51639717 4.057528e-03
## 49 ENSMUSG00000000278 2045.98646 0.3099511 0.10336092 7.544613e-05
## 55 ENSMUSG00000000303 17.85736 -0.5020908 0.64094898 1.304545e-03
## 57 ENSMUSG00000000308 176.27983 0.6701575 0.30770701 1.435114e-04
## 78 ENSMUSG00000000409 16477.95193 -0.2151323 0.06143825 6.555411e-05
## padj FDR Input
## 5 0.031486877 < 0.1 TPM
## 16 0.072140679 < 0.1 TPM
## 49 0.003849140 < 0.1 TPM
## 55 0.033524391 < 0.1 TPM
## 57 0.006551012 < 0.1 TPM
## 78 0.003512334 < 0.1 TPM
## [1] 927 8
## Gene baseMean log2FoldChange lfcSE pvalue
## 5 ENSMUSG00000000037 116.25308 0.4743945 0.33019341 9.150871e-04
## 16 ENSMUSG00000000125 32.76783 0.3608942 0.53978349 3.290075e-03
## 49 ENSMUSG00000000278 2059.81139 0.3107556 0.10319337 7.586541e-05
## 55 ENSMUSG00000000303 17.90961 -0.5333992 0.65247640 1.156826e-03
## 57 ENSMUSG00000000308 177.45680 0.6797485 0.30193197 1.237455e-04
## 78 ENSMUSG00000000409 16587.89462 -0.2146729 0.06232476 7.861174e-05
## padj FDR Input
## 5 0.025121492 < 0.1 Counts
## 16 0.062778062 < 0.1 Counts
## 49 0.003860243 < 0.1 Counts
## 55 0.030092651 < 0.1 Counts
## 57 0.005807258 < 0.1 Counts
## 78 0.003985386 < 0.1 Counts
## [1] 943 8
# Clean the data frames with renaming columns
for (x in shrinkage) {
# Join TPM and Counts tables by GENEID
df <- inner_join(sigList[[x]][[1]],
sigList[[x]][[2]],
by="Gene")
# Create a vector storing original column nams
my.colname1 <- colnames(sigList[[x]][[1]])[-1]
# Create a vector storing modified column names
my.colname2 <- c("GENEID",
paste0(my.colname1, "_", TvC[1]),
paste0(my.colname1, "_", TvC[2]))
# Rename the columns
colnames(df) <- my.colname2
# Add a column storing shrinkage method and drop redundant columns
df <- df %>%
mutate(Shrinkage = x) %>%
dplyr::select(-starts_with(c("lfcSE", "stat", "FDR")))
# Save the cleaned data frame in the list
sigList[[x]] <- df
# Explore the output data frame
print(head(df))
print(dim(df))
}
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSMUSG00000000037 119.57069 0.7900807 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125 32.58857 1.6829572 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278 2045.98646 0.3824490 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303 17.85736 -1.9802205 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308 176.27983 0.8913243 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409 16477.95193 -0.2448023 6.555411e-05 0.003512334
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 116.25308 0.8039195 9.150871e-04 0.025121492
## 2 TPM 32.76783 1.6985679 3.290075e-03 0.062778062
## 3 TPM 2059.81139 0.3823293 7.586541e-05 0.003860243
## 4 TPM 17.90961 -1.9770953 1.156826e-03 0.030092651
## 5 TPM 177.45680 0.8927097 1.237455e-04 0.005807258
## 6 TPM 16587.89462 -0.2445521 7.861174e-05 0.003985386
## Input_Counts Shrinkage
## 1 Counts None
## 2 Counts None
## 3 Counts None
## 4 Counts None
## 5 Counts None
## 6 Counts None
## [1] 915 12
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSMUSG00000000037 119.57069 0.5395572 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125 32.58857 0.4491230 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278 2045.98646 0.3565501 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303 17.85736 -0.4992870 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308 176.27983 0.6242894 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409 16477.95193 -0.2378428 6.555411e-05 0.003512334
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 116.25308 0.5518968 9.150871e-04 0.025121492
## 2 TPM 32.76783 0.4653339 3.290075e-03 0.062778062
## 3 TPM 2059.81139 0.3566400 7.586541e-05 0.003860243
## 4 TPM 17.90961 -0.5117566 1.156826e-03 0.030092651
## 5 TPM 177.45680 0.6298021 1.237455e-04 0.005807258
## 6 TPM 16587.89462 -0.2375228 7.861174e-05 0.003985386
## Input_Counts Shrinkage
## 1 Counts Normal
## 2 Counts Normal
## 3 Counts Normal
## 4 Counts Normal
## 5 Counts Normal
## 6 Counts Normal
## [1] 915 12
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSMUSG00000000037 119.57069 0.60077253 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125 32.58857 1.01605413 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278 2045.98646 0.33381180 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303 17.85736 -0.04631661 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308 176.27983 0.74965061 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409 16477.95193 -0.23092435 6.555411e-05 0.003512334
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 116.25308 0.62198670 9.150871e-04 0.025121492
## 2 TPM 32.76783 1.09745828 3.290075e-03 0.062778062
## 3 TPM 2059.81139 0.33379511 7.586541e-05 0.003860243
## 4 TPM 17.90961 -0.04878957 1.156826e-03 0.030092651
## 5 TPM 177.45680 0.75437557 1.237455e-04 0.005807258
## 6 TPM 16587.89462 -0.21981293 7.861174e-05 0.003985386
## Input_Counts Shrinkage
## 1 Counts Apeglm
## 2 Counts Apeglm
## 3 Counts Apeglm
## 4 Counts Apeglm
## 5 Counts Apeglm
## 6 Counts Apeglm
## [1] 915 12
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSMUSG00000000037 119.57069 0.4397207 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125 32.58857 0.3244977 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278 2045.98646 0.3099511 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303 17.85736 -0.5020908 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308 176.27983 0.6701575 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409 16477.95193 -0.2151323 6.555411e-05 0.003512334
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 116.25308 0.4743945 9.150871e-04 0.025121492
## 2 TPM 32.76783 0.3608942 3.290075e-03 0.062778062
## 3 TPM 2059.81139 0.3107556 7.586541e-05 0.003860243
## 4 TPM 17.90961 -0.5333992 1.156826e-03 0.030092651
## 5 TPM 177.45680 0.6797485 1.237455e-04 0.005807258
## 6 TPM 16587.89462 -0.2146729 7.861174e-05 0.003985386
## Input_Counts Shrinkage
## 1 Counts Ashr
## 2 Counts Ashr
## 3 Counts Ashr
## 4 Counts Ashr
## 5 Counts Ashr
## 6 Counts Ashr
## [1] 915 12
# Convert a list of data frames to one single data frame
lfcTable <- sigList[[1]]
for (i in 2:length(shrinkage)) {
lfcTable <- rbind(lfcTable, sigList[[i]])
}
# Explore the output data frame
head(lfcTable)
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSMUSG00000000037 119.57069 0.7900807 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125 32.58857 1.6829572 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278 2045.98646 0.3824490 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303 17.85736 -1.9802205 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308 176.27983 0.8913243 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409 16477.95193 -0.2448023 6.555411e-05 0.003512334
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 116.25308 0.8039195 9.150871e-04 0.025121492
## 2 TPM 32.76783 1.6985679 3.290075e-03 0.062778062
## 3 TPM 2059.81139 0.3823293 7.586541e-05 0.003860243
## 4 TPM 17.90961 -1.9770953 1.156826e-03 0.030092651
## 5 TPM 177.45680 0.8927097 1.237455e-04 0.005807258
## 6 TPM 16587.89462 -0.2445521 7.861174e-05 0.003985386
## Input_Counts Shrinkage
## 1 Counts None
## 2 Counts None
## 3 Counts None
## 4 Counts None
## 5 Counts None
## 6 Counts None
dim(lfcTable)
## [1] 3660 12
# Calculate differences between TPM and Counts input data
# in baseMean, log2FoldChange, and padj
lfcTable <- lfcTable %>%
mutate(mean_TC=baseMean_TPM - baseMean_Counts,
lfc_TC=log2FoldChange_TPM - log2FoldChange_Counts,
FDR_TC=padj_TPM - padj_Counts) %>%
# Add a column storing the number of alternative transcripts
left_join(AnnoDb.ntrans, by="GENEID")
# Explore the output data frame
head(lfcTable)
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSMUSG00000000037 119.57069 0.7900807 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125 32.58857 1.6829572 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278 2045.98646 0.3824490 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303 17.85736 -1.9802205 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308 176.27983 0.8913243 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409 16477.95193 -0.2448023 6.555411e-05 0.003512334
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 116.25308 0.8039195 9.150871e-04 0.025121492
## 2 TPM 32.76783 1.6985679 3.290075e-03 0.062778062
## 3 TPM 2059.81139 0.3823293 7.586541e-05 0.003860243
## 4 TPM 17.90961 -1.9770953 1.156826e-03 0.030092651
## 5 TPM 177.45680 0.8927097 1.237455e-04 0.005807258
## 6 TPM 16587.89462 -0.2445521 7.861174e-05 0.003985386
## Input_Counts Shrinkage mean_TC lfc_TC FDR_TC num.trans
## 1 Counts None 3.31761219 -0.0138387970 6.365386e-03 9
## 2 Counts None -0.17926424 -0.0156107344 9.362617e-03 1
## 3 Counts None -13.82492665 0.0001196663 -1.110363e-05 1
## 4 Counts None -0.05225284 -0.0031251921 3.431740e-03 3
## 5 Counts None -1.17697397 -0.0013854348 7.437539e-04 10
## 6 Counts None -109.94268956 -0.0002502272 -4.730519e-04 8
dim(lfcTable)
## [1] 3660 16
# Set a function to create a vector storing plot labels
plotlabels.fn <- function(myvec, mylist, num) {
vec <- c()
for (i in 1:num) {
vec <- c(vec, c(myvec[i], rep("", nrow(mylist[[i]]) - 1)))
}
return(vec)
}
my.param <- c("baseMean", "log2FoldChange", "padj")
# Slice and clean the data frame for input
lfcTable.comp <- lfcTable %>%
dplyr::select(GENEID, num.trans, Shrinkage, starts_with(my.param)) %>%
gather(Category, Value, -GENEID, -num.trans, -Shrinkage) %>%
separate(Category, c("Metric", "Input"), sep="_") %>%
pivot_wider(names_from=Input, values_from=Value) %>%
nest(-Metric, -Shrinkage)
# Create a vector storing Rsquared values between TPM and Counts outputs
corr.vec <- round(map_dbl(lfcTable.comp$data,
~cor(.x$TPM, .x$Counts)),
7)
# Create a ggplot labeling vector converted from the Rsquared vector
rsq.vec <- plotlabels.fn(corr.vec,
lfcTable.comp$data,
length(corr.vec))
# Unnest the data frame
lfcTable.comp <- lfcTable.comp %>%
unnest(data)
# Add a column storing Rsquared labels
lfcTable.comp$Rsquared.label <- rsq.vec
# Explore the cleaned data frame
head(lfcTable.comp)
## # A tibble: 6 x 7
## Shrinkage Metric GENEID num.trans TPM Counts Rsquared.label
## <chr> <chr> <chr> <int> <dbl> <dbl> <chr>
## 1 None baseMean ENSMUSG00000000037 9 120. 116. "0.9999785"
## 2 None baseMean ENSMUSG00000000125 1 32.6 32.8 ""
## 3 None baseMean ENSMUSG00000000278 1 2046. 2060. ""
## 4 None baseMean ENSMUSG00000000303 3 17.9 17.9 ""
## 5 None baseMean ENSMUSG00000000308 10 176. 177. ""
## 6 None baseMean ENSMUSG00000000409 8 16478. 16588. ""
# Nest the data frame by metric
lfcTable.comp <- lfcTable.comp %>%
nest(-Metric)
# Set a function creating a scatter plot
comp.scatter.fn <- function(df,
xvar,
yvar,
met,
xlabel,
ylabel) {
p <- ggplot(df, aes(x=xvar,
y=yvar,
color=log(num.trans),
label=Rsquared.label)) +
geom_point(alpha=0.5) +
theme_bw() +
ggtitle(paste("Comparison in", met, "\n(with R-Squared)")) +
geom_text(size=5,
mapping=aes(x=Inf, y=Inf),
vjust=2, hjust=3.8, color="black") +
geom_abline(slope=1, size=0.5, linetype="dashed", color="black") +
scale_color_gradient(low="blue", high="red") +
facet_wrap(~Shrinkage, scales="free") +
theme(strip.text.x=element_text(size=10)) +
xlab(paste("Input from", xlabel)) +
ylab(paste("Input from", ylabel))
return(p)
}
# Print the plots
for (i in 1:nrow(lfcTable.comp)) {
df <- lfcTable.comp$data[[i]]
df$Shrinkage <- factor(df$Shrinkage, levels=unique(df$Shrinkage))
print(comp.scatter.fn(df,
df$TPM,
df$Counts,
lfcTable.comp$Metric[i], "TPM", "Counts"))
}
# Transform the data frame
lfcTable.rank <- lfcTable.comp %>%
unnest(data) %>%
nest(-Metric, -Shrinkage)
# Clean and arrange the data frame
for (i in 1:length(lfcTable.rank$data)) {
df <- lfcTable.rank$data[[i]]
# Create a vector storing rank (1 to the last)
rank.vec <- 1:nrow(df)
# Make descending order if the metric = "padj" and assigne rankings
if (lfcTable.rank$Metric[[i]] == "padj") {
df <- df %>%
dplyr::arrange(TPM) %>%
mutate(rank.TPM=rank.vec) %>%
dplyr::arrange(Counts) %>%
mutate(rank.Counts=rank.vec)
# Make asending order otherwise and assign rankings
} else {
df <- df %>%
dplyr::arrange(desc(abs(TPM))) %>%
mutate(rank.TPM=rank.vec) %>%
dplyr::arrange(desc(abs(Counts))) %>%
mutate(rank.Counts=rank.vec)
}
# Save the updated data frame
lfcTable.rank$data[[i]] <- df
}
# Create a vector storing Rsquared values between TPM and Counts outputs
corr.vec <- round(map_dbl(lfcTable.rank$data,
~cor(.x$rank.TPM, .x$rank.Counts)),
7)
# Create a ggplot labeling vector converted from the Rsquared vector
rsq.vec <- plotlabels.fn(corr.vec, lfcTable.rank$data, length(corr.vec))
# Unnest the data frame
lfcTable.rank <- lfcTable.rank %>%
unnest(data)
# Add a column storing Rsquared labels
lfcTable.rank$Rsquared.label <- rsq.vec
# Explore the cleaned data frame
head(lfcTable.rank)
## # A tibble: 6 x 9
## Metric Shrinkage GENEID num.trans TPM Counts Rsquared.label rank.TPM
## <chr> <chr> <chr> <int> <dbl> <dbl> <chr> <int>
## 1 baseMe… None ENSMUSG0000… 3 72567. 73053. "0.9999016" 1
## 2 baseMe… None ENSMUSG0000… 3 53754. 54115. "" 2
## 3 baseMe… None ENSMUSG0000… 11 53183. 53541. "" 3
## 4 baseMe… None ENSMUSG0000… 4 46583. 46896. "" 4
## 5 baseMe… None ENSMUSG0000… 1 43174. 43432. "" 5
## 6 baseMe… None ENSMUSG0000… 3 38943. 39207. "" 6
## # … with 1 more variable: rank.Counts <int>
# Nest the data frame by metric
lfcTable.rank <- lfcTable.rank %>%
nest(-Metric)
# Print the plots
for (i in 1:nrow(lfcTable.rank)) {
df <- lfcTable.rank$data[[i]]
df$Shrinkage <- factor(df$Shrinkage, levels=unique(df$Shrinkage))
pl <- comp.scatter.fn(df,
df$rank.TPM,
df$rank.Counts,
lfcTable.rank$Metric[i],
"TPM", "Counts") +
ggtitle(paste("Comparison in", lfcTable.rank$Metric[i], "Rank\n(with R-Squared)"))
print(pl)
}
# Set a function cleaning data to generate upset plots
upset.input.fn <- function(df) {
df <- df %>%
# Filter genes with valid padj
subset(!is.na(padj)) %>%
mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes?
Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""), # What are downregulated genes?
Unchanged=ifelse(FDR == FDRv[2], Gene, ""), # What are unchanged genes?
TPM_Input=ifelse(Input == "TPM", Gene, ""), # What are the genes from TPM input?
Counts_Input=ifelse(Input == "Counts", Gene, "")) # What are the genes from Counts input?
# Create a list storing groups of interest
upsetInput <- list(Up=df$Up,
Down=df$Down,
Unchanged=df$Unchanged,
TPM_Input=df$TPM,
Counts_Input=df$Counts)
return(upsetInput)
}
upsetList <- upset.input.fn(both.df)
# Create the upset plot
upset(fromList(upsetList),
sets=names(upsetList), # What group to display
sets.x.label="Number of Genes per Group",
order.by="freq",
point.size=3,
sets.bar.color=c("red", "red","blue", "blue", "blue"),
text.scale = 1.5, number.angles=30)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /home/mira/miniconda3/envs/r/lib/libblas.so.3.8.0
## LAPACK: /home/mira/miniconda3/envs/r/lib/liblapack.so.3.8.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ashr_2.2-47 apeglm_1.12.0
## [3] ensembldb_2.14.0 AnnotationFilter_1.14.0
## [5] GenomicFeatures_1.42.1 AnnotationDbi_1.52.0
## [7] UpSetR_1.4.0 pheatmap_1.0.12
## [9] gridExtra_2.3 DESeq2_1.30.1
## [11] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [13] MatrixGenerics_1.2.0 matrixStats_0.58.0
## [15] GenomicRanges_1.42.0 GenomeInfoDb_1.26.0
## [17] IRanges_2.24.1 S4Vectors_0.28.1
## [19] tximport_1.18.0 forcats_0.5.1
## [21] stringr_1.4.0 dplyr_1.0.5
## [23] purrr_0.3.4 readr_1.4.0
## [25] tidyr_1.1.3 tibble_3.1.0
## [27] ggplot2_3.3.3 tidyverse_1.3.0
## [29] AnnotationHub_2.22.0 BiocFileCache_1.14.0
## [31] dbplyr_2.1.0 BiocGenerics_0.36.0
## [33] rmarkdown_2.7 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.1
## [3] plyr_1.8.6 lazyeval_0.2.2
## [5] splines_4.0.3 BiocParallel_1.24.0
## [7] digest_0.6.27 invgamma_1.1
## [9] htmltools_0.5.1.1 SQUAREM_2021.1
## [11] fansi_0.4.2 magrittr_2.0.1
## [13] memoise_2.0.0 Biostrings_2.58.0
## [15] annotate_1.68.0 modelr_0.1.8
## [17] askpass_1.1 bdsmatrix_1.3-4
## [19] prettyunits_1.1.1 colorspace_2.0-0
## [21] blob_1.2.1 rvest_0.3.6
## [23] rappdirs_0.3.3 haven_2.3.1
## [25] xfun_0.21 crayon_1.4.1
## [27] RCurl_1.98-1.2 jsonlite_1.7.2
## [29] genefilter_1.72.1 survival_3.2-7
## [31] glue_1.4.2 gtable_0.3.0
## [33] zlibbioc_1.36.0 XVector_0.30.0
## [35] DelayedArray_0.16.0 scales_1.1.1
## [37] mvtnorm_1.1-1 DBI_1.1.1
## [39] Rcpp_1.0.6 xtable_1.8-4
## [41] progress_1.2.2 emdbook_1.3.12
## [43] bit_4.0.4 truncnorm_1.0-8
## [45] httr_1.4.2 RColorBrewer_1.1-2
## [47] ellipsis_0.3.1 farver_2.0.3
## [49] pkgconfig_2.0.3 XML_3.99-0.5
## [51] sass_0.3.1 locfit_1.5-9.4
## [53] utf8_1.1.4 labeling_0.4.2
## [55] tidyselect_1.1.0 rlang_0.4.10
## [57] later_1.1.0.1 munsell_0.5.0
## [59] BiocVersion_3.12.0 cellranger_1.1.0
## [61] tools_4.0.3 cachem_1.0.4
## [63] cli_2.3.1 generics_0.1.0
## [65] RSQLite_2.2.3 broom_0.7.5
## [67] evaluate_0.14 fastmap_1.1.0
## [69] yaml_2.2.1 knitr_1.31
## [71] bit64_4.0.5 fs_1.5.0
## [73] mime_0.10 xml2_1.3.2
## [75] biomaRt_2.46.3 compiler_4.0.3
## [77] rstudioapi_0.13 curl_4.3
## [79] interactiveDisplayBase_1.28.0 reprex_1.0.0
## [81] geneplotter_1.68.0 bslib_0.2.4
## [83] stringi_1.5.3 highr_0.8
## [85] ps_1.5.0 lattice_0.20-41
## [87] ProtGenerics_1.22.0 Matrix_1.3-2
## [89] vctrs_0.3.6 pillar_1.5.0
## [91] lifecycle_1.0.0 BiocManager_1.30.10
## [93] jquerylib_0.1.3 irlba_2.3.3
## [95] bitops_1.0-6 httpuv_1.5.5
## [97] rtracklayer_1.50.0 R6_2.5.0
## [99] promises_1.2.0.1 MASS_7.3-53.1
## [101] assertthat_0.2.1 openssl_1.4.3
## [103] withr_2.4.1 GenomicAlignments_1.26.0
## [105] Rsamtools_2.6.0 GenomeInfoDbData_1.2.4
## [107] hms_1.0.0 grid_4.0.3
## [109] coda_0.19-4 mixsqp_0.3-43
## [111] bbmle_1.0.23.1 numDeriv_2016.8-1.1
## [113] shiny_1.6.0 lubridate_1.7.10